import excel "../Raw/GCEL_Company_list/GCEL_Company_list.xlsx", clear first sheet("Output")
duplicates drop
drop A csr20 cspp20 csr30 cspp30 Z
rename Q CoalSharePowerBasedGenCap

//clean up name variables very lightly 
replace Company = strtrim(Company)
replace ParentCompany = strtrim(ParentCompany)
replace CoalIndustrySector = "Power, Services, Mining" if GCELcompanyID == 524
replace subsidiariesaffiliatesjointv = 12 if GCELcompanyID == 524 
replace CoalIndustrySector = "Power, Mining, Services (finance)" if GCELcompanyID == 670
duplicates drop 
duplicates report GCELcompanyID ParentCompany

replace subsidiariesaffiliatesjointv = . if GCELcompanyID == 6300 //Fixing mistake in parent designation
gen parentID_test = GCELcompanyID if missing(subsidiariesaffiliatesjointv)
order GCELcompanyID parentID_test
replace parentID_test = parentID_test[_n-1] if ~missing(subsidiariesaffiliatesjointv)

gen parent_clean = lower(ParentCompany)
replace parent_clean = subinstr(parent_clean,".","",.)

//manual fixes 
replace parentID_test = 294 if GCELcompanyID == 6328
replace parentID_test = 1507 if ParentCompany == "Jindal Steel & Power Ltd"
replace parentID_test = . if inlist(GCELcompanyID,1480,6093,1627,1742)
egen npID = nvals(parentID_test), by(parent_clean)
egen npName = nvals(parent_clean), by(parentID_test)

//for missing parents, use their own ID number 
replace parentID_test = GCELcompanyID if missing(parentID_test)

//Get rid of a duplicate observation 
drop if GCELcompanyID == 2927
duplicates report Company ParentCompany

//look for companies with multiple IDs across different parents 
egen nID = nvals(GCELcompanyID), by(Company)
replace GCELcompanyID = 816 if GCELcompanyID == 2680
replace GCELcompanyID = 1870 if GCELcompanyID == 2802
drop nID

tempfile gcel 
save `gcel'

//NOW GET PARENTS 
use `gcel'
order Company ParentCompany GCELcompanyID
gen likelyparent = subsidiariesaffiliatesjointv == .
gen namesame = Company == ParentCompany
gen isParent = subsidiariesaffiliatesjointv == . | Company == ParentCompany

keep if subsidiariesaffiliatesjointv == .


use `gcel', clear
keep Company GCELcompanyID
duplicates drop
rename GCELcompanyID ParentcompanyID 
rename Company ParentCompany
replace ParentCompany = strtrim(ParentCompany)
tempfile parent 
save `parent'

use `gcel', clear 
replace ParentCompany = strtrim(ParentCompany)
merge m:1 ParentCompany using `parent'

import excel "../Raw/GCEL_Company_list/GCEL_Company_list.xlsx", clear first sheet("Output")
duplicates drop
drop A csr20 cspp20 csr30 cspp30 Z
rename Q CoalSharePowerBasedGenCap
replace Company = strtrim(Company)
replace ParentCompany = strtrim(ParentCompany)
replace CoalIndustrySector = "Power, Services, Mining" if GCELcompanyID == 524
replace subsidiariesaffiliatesjointv = 12 if GCELcompanyID == 524 
replace CoalIndustrySector = "Power, Mining, Services (finance)" if GCELcompanyID == 670
duplicates drop 
duplicates report GCELcompanyID ParentCompany
keep if missing(subsidiariesaffiliatesjointv) //keep parents only
drop subsidiariesaffiliatesjointv
drop Company
order GCELcompanyID ParentCompany
rename GCELcompanyID ParentID_GCEL 
gen expansion_infrastructure = regexm(expansionplans,"infrastructure")
gen expansion_mining = regexm(expansionplans,"mining")
gen expansion_power = regexm(expansionplans,"power")
drop expansionplans 
gen coal_industry_power = regexm(CoalIndustrySector,"Power")
gen coal_industry_services = regexm(CoalIndustrySector,"Services")
gen coal_industry_mining = regexm(CoalIndustrySector,"Mining")
drop CoalIndustrySector

drop ExpansionPlansCoalPowerCoun AnnualCoalProductionCountrie ExpansionCoalMiningCountries
drop CompanyWebsite CompanyReports InstalledCoalPowerCapacityC

rename CountryofHeadquarters Country
rename CoalShareofPowerProduction CoalSharePower
rename CoalShareofRevenue CoalFracRev

// Cleaning CoalSharePowerParent
replace CoalSharePower = subinstr(CoalSharePower,">","",.)
replace CoalSharePower = subinstr(CoalSharePower,"<","",.)
replace CoalSharePower = subinstr(CoalSharePower,"%","",.)

// Cleaning CoalFracRevParent
replace CoalFracRev = subinstr(CoalFracRev,">","",.)
replace CoalFracRev = subinstr(CoalFracRev,"<","",.)
replace CoalFracRev = subinstr(CoalFracRev,"%","",.)

destring CoalFracRev, replace ignore("NA" "NI" "/")
destring CoalSharePower, replace ignore("NA" "NI" "/")
replace CoalFracRev = CoalFracRev/100 if CoalFracRev>1
replace CoalSharePower = CoalSharePower/100 if CoalSharePower > 1


//custom cleaning
replace AnnualCoalProductioninmill = "" if ParentID_GCEL == 2136
replace AnnualCoalProductioninmill = "6" if AnnualCoalProductioninmill == "<6"
replace AnnualCoalProductioninmill = "75.7" if AnnualCoalProductioninmill == ">75,7"


//cleaning other numeric variables 
foreach var of var ExpansionPlansCoalPowerPro ExpansionPlansCoalPowerTot InstalledCoalPowerCapacity AnnualCoalProductioninmill {
	replace `var' = subinstr(`var',">","",.)
	destring `var', replace ignore("NA" "NI" "/")
}


// Getting ContinentParent variable
kountry Country, from(other) geo(marc)
rename GEO Continent
drop NAMES_STD
replace Continent = "Asia" if Country == "China (Hong Kong)" | Country == "Taiwan"
replace Continent = "Europe" if Country == "Jersey"
order Continent, after(Country)

save ../Intermediate/GCEL_parent_clean, replace

